In [20]:
from torchvision import datasets
from torch.utils.data import DataLoader
from torch import optim
from torchvision import transforms
import torch
import matplotlib.pyplot as plt
from torch.utils.data import random_split
import random
from skimage.transform import resize

Explainable AI¶

In this assignment I explore explainable AI, starting with small-scale experiments on MNIST using a NumPy-based model. The goal is to build a foundation for understanding how model decisions can be interpreted. Gradually, I move toward implementing a standard method (Grad-CAM), and then scale the approach to more complex models and datasets using PyTorch.

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "mps")
%load_ext autoreload
%autoreload 2
In [3]:
transform = transforms.Compose([
transforms.ToTensor(),
# transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transform)

batch_size = 128

train_subset, _ = random_split(train_dataset, [1000, len(train_dataset) - 1000])
test_subset, _ = random_split(test_dataset, [200, len(test_dataset) - 200])

# Create DataLoader with subset
train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
In [4]:
classes = train_dataset.classes

samples = {}
for img, label in train_dataset:
    if label not in samples:
        samples[label] = img
    if len(samples) == len(classes):
        break

# Plot the images.
fig, axs = plt.subplots(1, len(classes), figsize=(15, 4))
for label, ax in enumerate(axs):
    img_np = samples[label].permute(1, 2, 0).numpy()
    ax.imshow(img_np, cmap='gray')
    ax.set_title(classes[label])
    ax.axis('off')
plt.tight_layout()
plt.show()
No description has been provided for this image
In [5]:
def softmax(x):
    x_shifted = x - np.max(x, axis=1, keepdims=True)
    exps = np.exp(x_shifted)
    return exps / np.sum(exps, axis=1, keepdims=True)


def l2_regularization(W, reg_strength):
    loss = np.sum(np.square(W)) * reg_strength
    grad = 2 * reg_strength * W
    return loss, grad


def softmax_with_cross_entropy(predictions, target_index):
    batch_size = predictions.shape[0]
    probs = softmax(predictions)
    correct_probs = probs[np.arange(batch_size), target_index]
    loss = -np.sum(np.log(correct_probs)) / batch_size  # Average loss
    d_pred = probs.copy()
    d_pred[np.arange(batch_size), target_index] -= 1
    d_pred /= batch_size  # Normalize gradient
    return loss, d_pred


class Param:
    def __init__(self, value):
        self.value = value
        self.grad = np.zeros_like(value)


class ReLULayer:
    def __init__(self):
        self.mask = None

    def forward(self, X):
        self.mask = X > 0
        X = X * self.mask
        return X

    def backward(self, d_out):
        d_result = d_out * self.mask

        return d_result

    def params(self):
        return {}


class FullyConnectedLayer:
    def __init__(self, n_input, n_output):
        self.W = Param(0.001 * np.random.randn(n_input, n_output))
        self.B = Param(0.001 * np.random.randn(1, n_output))
        self.X = None

    def forward(self, X):
        self.X = X

        return X @ self.W.value + self.B.value

    def backward(self, d_out):
        d_input = d_out @ self.W.value.T

        d_W = self.X.T @ d_out
        self.W.grad += d_W

        d_B = np.sum(d_out, axis=0, keepdims=True)
        self.B.grad += d_B

        return d_input

    def params(self):
        return {'W': self.W, 'B': self.B}

import numpy as np

class ConvolutionalLayer:
    def __init__(self, in_channels, out_channels,
                 filter_size, padding):
        self.filter_size = filter_size
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.W = Param(
            np.random.randn(filter_size, filter_size, in_channels, out_channels) * np.sqrt(
                2.0 / (filter_size * filter_size * in_channels))
        )

        self.B = Param(np.zeros(out_channels))

        self.padding = padding

        self.last_X = None

        self.feature_maps = None
        self.gradients = None

    def forward(self, X):
        # get shape of the input tensor
        batch_size, in_channels, height, width = X.shape

        # compute shape of output tensor
        out_height = height - self.filter_size + 1 + 2 * self.padding
        out_width = width - self.filter_size + 1 + 2 * self.padding

        # pad the input tensor
        X = np.pad(X, ((0, 0), (0, 0), (self.padding,) * 2, (self.padding,) * 2))

        # save last input for backward pass
        self.last_X = X

        # create zeros tensor for result
        result = np.zeros((batch_size, self.out_channels, out_height, out_width))

        # reshape weights to use matrix multiplication trick
        weights = self.W.value.reshape(self.in_channels * self.filter_size * self.filter_size, self.out_channels)

        # iterate each pixel in output tensor
        for y in range(out_height):
            for x in range(out_width):
                # take the perception widow of the output pixel
                patch = X[:, :, y:y + self.filter_size, x:x + self.filter_size]

                # unwrap patch to use matrix multiplication trick
                patch_flat = patch.reshape(batch_size, self.in_channels * self.filter_size * self.filter_size)

                # convolution operation
                res = patch_flat @ weights

                # add bias to result
                res += self.B.value

                # add pixels to result tensor
                result[:, :, y, x] = res

        self.feature_maps = result.copy()
        return result

    def backward(self, d_out):
        self.gradients = d_out.copy()
        # get shape of last input passed to forward pass
        batch_size, in_channels, height, width = self.last_X.shape

        # get shape of the received gradient
        _, out_channels, out_height, out_width = d_out.shape

        # create d_input with proper shape
        d_X = np.zeros_like(self.last_X)

        # get and reshape weights to use in calculations
        weights = self.W.value.reshape(self.in_channels * self.filter_size * self.filter_size, self.out_channels)

        for y in range(out_height):
            for x in range(out_width):
                # take the gradient patch (batch_size, out_channels)
                gradient_patch = d_out[:, :, y, x]

                # get and reshape input patch to (batch_size, self.in_channels*self.filter_size*self.filter_size)
                input_patch = self.last_X[:, :, y:y + self.filter_size, x:x + self.filter_size]
                input_patch_flat = input_patch.reshape(batch_size,
                                                       self.in_channels * self.filter_size * self.filter_size)

                # d_X to get gradient with respect to input -> d_out + weights

                d_input_flat = gradient_patch @ weights.T

                d_input_patch = d_input_flat.reshape(batch_size, in_channels, self.filter_size, self.filter_size)

                d_X[:, :, y:y + self.filter_size, x:x + self.filter_size] += d_input_patch

                # d_w gradient with respect to weights -> d_out + inputs
                d_flat_w = input_patch_flat.T @ gradient_patch

                d_w = d_flat_w.reshape(self.filter_size, self.filter_size, self.in_channels, self.out_channels)

                self.W.grad += d_w

                # d_b gradient with respect to bias term -> d_out

                d_b = np.sum(gradient_patch, axis=0)

                self.B.grad += d_b

        # return d_X without padding
        return d_X[:, :, self.padding:-self.padding, self.padding:-self.padding] if self.padding > 0 else d_X

    def params(self):
        return {'W': self.W, 'B': self.B}
class MaxPoolingLayer:
    def __init__(self, pool_size, stride):
        self.pool_size = pool_size
        self.stride = stride
        self.X = None

    def forward(self, X):
        batch_size, channels, height, width = X.shape

        self.X = X

        # compute output tensor size
        out_height = (height - self.pool_size) // self.stride + 1
        out_width = (width - self.pool_size) // self.stride + 1

        # init resulting tensor with proper shape
        result = np.zeros((batch_size, channels, out_height, out_width))

        for y in range(out_height):
            for x in range(out_width):
                start_y = y * self.stride
                start_x = x * self.stride

                # take input patch
                patch = X[:, :, start_y:start_y + self.pool_size, start_x:start_x + self.pool_size]

                # get maximum over axis (1,2) means for each batch(0) and chanel(3) get the maximum of matrix (pool_size, pool_size) that stores actual pixel values
                patch_max = np.max(patch, axis=(2, 3))

                # assign maximum over each batch and chanel to the point in result tensor
                result[:, :, y, x] = patch_max

        return result

    def backward(self, d_out):
        batch_size, channels, height, width = self.X.shape

        d_input = np.zeros_like(self.X)

        out_width = (width - self.pool_size) // self.stride + 1
        out_height = (height - self.pool_size) // self.stride + 1

        for y in range(out_height):
            for x in range(out_width):
                input_patch = self.X[:, :, y * self.stride:y * self.stride + self.pool_size,
                              x * self.stride:x * self.stride + self.pool_size]

                # Flatten each patch for max index computation
                input_patch_reshaped = input_patch.reshape(batch_size, channels, -1)

                # Find the index of the maximum value in each patch
                max_idx_local = np.argmax(input_patch_reshaped, axis=2)

                # Convert flat indices back to 2D indices
                row_idx_local, col_idx_local = np.unravel_index(max_idx_local, (self.pool_size, self.pool_size))

                # Convert local indices to global indices in the input tensor
                row_idx_global = row_idx_local + y * self.stride
                col_idx_global = col_idx_local + x * self.stride

                # Batch and channel indices for broadcasting
                batch_idx = np.arange(batch_size)[:, None]
                ch_idx = np.arange(channels)

                # Accumulate gradient values at the positions of the max values
                d_input[batch_idx, ch_idx, row_idx_global, col_idx_global] += d_out[:, :, y, x]

        return d_input

    def params(self):
        return {}


class Flattener:
    def __init__(self):
        self.X_shape = None

    def forward(self, X):
        batch_size, channels, height, width = X.shape

        self.X_shape = X.shape

        return X.reshape(batch_size, -1)

    def backward(self, d_out):
        return d_out.reshape(self.X_shape)

    def params(self):
        return {}

Implementing Grad-CAM in Custom NumPy Model¶

In this section I extend my NumPy-based convolutional model with support for Grad-CAM visualizations. Grad-CAM allows us to understand which spatial regions of the input were most important for a model’s decision, by tracing back from output predictions to the internal convolutional features. I implemented three versions of Grad-CAM progressively, starting from basic intuition to the original algorithm.

grad_cam_v1 — Gradient w.r.t. Input¶

In the first version, I compute the gradient of the final logits with respect to the input image. This method is conceptually aligned with the idea of derivatives which actually tells how much a change in that input pixel would affect the final prediction(logits).

Steps:

  • Perform a forward pass to get the prediction logits.
  • Backpropagate from the logits to the input.
  • Take the absolute value of the input gradients and aggregate over channels to form a heatmap.
  • Visualize it alongside the input.

This version produces a sensitivity map, but it lacks localization in deeper features and tends to highlight high-frequency noise.

grad_cam_v2 — Target Class Masking¶

In the second version, I introduce masking to isolate the gradient signal for a specific target class. Rather than backpropagating from all outputs (which mixes gradients from multiple classes), I zero out the logits of all other classes and backpropagate only from the one of interest.

Steps:

  • Perform a forward pass to compute logits.
  • Create a mask that keeps only the logit of the target class active.
  • Backpropagate only from the target logit.
  • Compute gradients w.r.t. the input and visualize as in v1.

This approach improves the relevance of the heatmap to the specific class and removes confusion from unrelated outputs.

grad_cam_v3 — Original Grad-CAM¶

The third version follows the original Grad-CAM method more closely. Instead of working with gradients at the input level, we inspect the gradients at a chosen convolutional layer and use them to compute class-specific importance for each feature channel.

Steps:

  • Forward pass with capture of the convolutional feature maps.
  • Backward pass from the target logit to compute gradients w.r.t. those features.
  • Global average pool the gradients to get a weight for each channel.
  • Compute a weighted sum of the feature maps and apply ReLU.
  • Normalize and upsample to input size for visualization.
In [13]:
class Model:
    def __init__(self, layers):
        """
        Initialize the model with a list of layers.
        """
        self.layers = layers
        self.last_output = None

    def forward(self, x):
        """
        Perform a forward pass through all layers.
        """
        for layer in self.layers:
            x = layer.forward(x)

        self.last_output = x
        return x

    def backward(self, grad):
        """
        Backpropagate the gradient through all layers.
        """
        for layer in reversed(self.layers):
            grad = layer.backward(grad)
        return grad

    def grad_cam_v1(self, input_bathed, threshold=0.3):
        """
        Computes gradients of logits w.r.t. input image.
        Visualizes input sensitivity across all channels.
        """

        self.forward(input_bathed)

        input_grad = self.backward(self.last_output)
        self.clear_params()


        alpha = 0.5
        for inp, mask in zip(input_batched, input_grad):
            heatmap = np.max(np.abs(mask), axis=0)

            heatmap_min = np.min(heatmap)
            heatmap_max = np.max(heatmap)
            heatmap = (heatmap - heatmap_min) / (heatmap_max - heatmap_min + 1e-8)

            heatmap = np.where(heatmap < threshold, 0, heatmap)

            inp_img = inp.permute(1, 2, 0).numpy()

            fig, axs = plt.subplots(1, 3, figsize=(12, 4))

            axs[0].imshow(inp_img)
            axs[0].imshow(heatmap, alpha=alpha)
            axs[0].set_title("Overlay")
            axs[0].axis('off')

            axs[1].imshow(inp_img)
            axs[1].set_title("Original")
            axs[1].axis('off')

            axs[2].imshow(heatmap, cmap='jet')
            axs[2].set_title("Heatmap")
            axs[2].axis('off')

            plt.tight_layout()
            plt.show()

        return input_grad


    def grad_cam_v2(self, input_bathed, target_class = None, threshold=0):
        """
        Computes gradients of only the target class logit w.r.t. input image.
        Focuses attention on class-specific sensitivity regions.
        """

        outputs = self.forward(input_bathed)

        batch_size = input_bathed.shape[0]

        masked_output = np.zeros_like(outputs)
        masked_output[np.arange(batch_size), target_class] = outputs[np.arange(batch_size), target_class]

        input_grad = self.backward(masked_output)
        self.clear_params()

        alpha = 0.5
        for inp, mask in zip(input_batched, input_grad):
            heatmap = np.max(np.abs(mask), axis=0)

            heatmap_min = np.min(heatmap)
            heatmap_max = np.max(heatmap)
            heatmap = (heatmap - heatmap_min) / (heatmap_max - heatmap_min + 1e-8)

            heatmap = np.where(heatmap < threshold, 0, heatmap)

            inp_img = inp.permute(1, 2, 0).numpy()

            fig, axs = plt.subplots(1, 3, figsize=(12, 4))

            axs[0].imshow(inp_img)
            axs[0].imshow(heatmap, alpha=alpha,)
            axs[0].set_title("Overlay")
            axs[0].axis('off')

            axs[1].imshow(inp_img)
            axs[1].set_title("Original")
            axs[1].axis('off')

            axs[2].imshow(heatmap, cmap='jet')
            axs[2].set_title("Heatmap")
            axs[2].axis('off')

            plt.tight_layout()
            plt.show()

        return input_grad

    def grad_cam_v3(self, input_batched, target_class=None, conv_layer_index=-1, threshold=0.0, alpha=0.5):
        """
        Implements the original Grad-CAM algorithm.
        Uses gradients of feature maps in a selected convolutional layer
        to compute class-specific spatial importance heatmaps.
        """

        # Forward pass while capturing feature maps from the chosen conv layer.
        feature_maps = None
        target_conv_layer = None
        x = input_batched
        for idx, layer in enumerate(self.layers):
            x = layer.forward(x)
            # If this is the selected convolutional layer:
            if idx == conv_layer_index and hasattr(layer, 'feature_maps'):
                feature_maps = layer.feature_maps.copy()  # [batch_size, channels, conv_h, conv_w]
                target_conv_layer = layer

        outputs = x
        batch_size = outputs.shape[0]

        target_class = np.array(target_class)

        # Mask outputs so that only the target class logit contributes.
        masked_output = np.zeros_like(outputs)
        masked_output[np.arange(batch_size), target_class] = outputs[np.arange(batch_size), target_class]

        # Backward pass: compute gradients using the masked output.
        self.backward(masked_output)
        self.clear_params()

        # Retrieve gradients from the selected conv layer.
        grads = target_conv_layer.gradients  # shape: [batch_size, channels, conv_h, conv_w]

        # Compute channel-wise weights by global average pooling.
        weights = np.mean(grads, axis=(2, 3))  # shape: [batch_size, channels]

        heatmaps = []
        for i in range(batch_size):
            cam = np.zeros(feature_maps.shape[2:], dtype=np.float32)
            for k in range(feature_maps.shape[1]):
                cam += weights[i, k] * feature_maps[i, k, :, :]
            cam = np.maximum(cam, 0) # Relu
            # Normalize the heatmap.
            cam_min, cam_max = cam.min(), cam.max()
            cam = (cam - cam_min) / (cam_max - cam_min + 1e-8)
            if threshold > 0:
                cam = np.where(cam < threshold, 0, cam)
            heatmaps.append(cam)
        heatmaps = np.array(heatmaps)  # shape: [batch_size, conv_h, conv_w]

        # Upsample the heatmaps to the size of the input images and display.
        for inp, cam in zip(input_batched, heatmaps):
            inp_img = inp.permute(1, 2, 0).detach().cpu().numpy()

            upsampled_cam = resize(cam, (inp_img.shape[0], inp_img.shape[1]), mode='reflect', anti_aliasing=True)
            fig, axs = plt.subplots(1, 3, figsize=(12, 4))
            axs[0].imshow(inp_img)
            axs[0].imshow(upsampled_cam, alpha=alpha)
            axs[0].set_title("Overlay")
            axs[0].axis('off')
            axs[1].imshow(inp_img)
            axs[1].set_title("Original")
            axs[1].axis('off')
            axs[2].imshow(upsampled_cam, cmap='jet')
            axs[2].set_title("Heatmap")
            axs[2].axis('off')
            plt.tight_layout()
            plt.show()

        return heatmaps


    def compute_loss_and_grad(self, predictions, target_index):
        """
        Compute the softmax with cross entropy loss and return both the loss
        and the gradient.
        """
        loss, grad = softmax_with_cross_entropy(predictions, target_index)
        return loss, grad

    def params(self):
        """
        Collects parameters from all layers.
        Returns a dictionary mapping layer names to their parameters.
        """
        parameters = {}
        for idx, layer in enumerate(self.layers):
            for name, param in layer.params().items():
                parameters[f"layer{idx}_{name}"] = param
        return parameters

    def clear_params(self):
        """
        Clear all parameters.
        """
        for param in self.params().values():
            param.grad = np.zeros_like(param.grad)

    def update_params(self, lr):
        """
        Updates all parameters.
        """
        for param in self.params().values():
            param.value -= lr * param.grad
            param.grad = np.zeros_like(param.grad)

    def train(self, lr, epochs, data_loader):
        """
        Trains the model.
        data_loader should yield batches as (x, y) where:
          - x is input with shape (batch_size, input_size)
          - y contains the target class indices for each sample.
        """
        loss_list = []
        for epoch in range(epochs):
            epoch_loss = 0.0
            for x, y in data_loader:
                # Forward pass: compute predictions for the current batch.
                predictions = self.forward(x)

                # Compute loss and gradient at the output.
                loss, grad = self.compute_loss_and_grad(predictions, y)
                epoch_loss += loss

                # Backward pass: propagate gradients through the network.
                self.backward(grad)

                # Update all parameters using gradient descent.
                self.update_params(lr)

            avg_loss = epoch_loss / len(data_loader)
            loss_list.append(avg_loss)
            print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")

        # Plot training loss.
        plt.plot(loss_list)
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.title('Training Loss')
        plt.show()
In [14]:
class MomentumSGD:
    def __init__(self, momentum=0.9):
        self.momentum = 0.9
        self.velocities = {}

    def update(self, w, d_w, learning_rate, param_name):
        if param_name not in self.velocities:
            self.velocities[param_name] = np.zeros_like(w)

        self.velocities[param_name] = self.momentum * self.velocities[param_name] - learning_rate * d_w

        return w + self.velocities[param_name]

def train_model(model, train_loader, test_loader, epochs, lr, optimizer=MomentumSGD(), save_path=None):
    history = {
        'train_loss': [],
        'train_acc': [],
        'test_loss': [],
        'test_acc': []
    }
    for epoch in range(epochs):
        # Training phase
        model.train_mode = True
        epoch_loss = 0.0
        correct_train = 0
        total_train = 0
        batch_count = 0

        print(f"Epoch {epoch + 1}/{epochs}")
        print("-" * 30)

        for x, y in train_loader:
            # Data conversion and transfer

            x = x.numpy().astype(np.float32)
            y = y.numpy()

            # Forward pass
            predictions = model.forward(x)

            # Loss computation
            loss, grad = softmax_with_cross_entropy(predictions, y)

            # Calculate training accuracy
            pred_classes = np.argmax(predictions, axis=1)
            correct_train += np.sum(pred_classes == y)
            total_train += len(y)

            # Backward pass
            model.backward(grad)

            # Parameter update
            for param_name, param in model.params().items():
                param.value = optimizer.update(param.value, param.grad, lr, param_name)
                param.grad = np.zeros_like(param.grad)

            epoch_loss += loss
            batch_count += 1

        # Calculate training metrics
        avg_train_loss = epoch_loss / batch_count
        train_accuracy = (correct_train / total_train) * 100
        history['train_loss'].append(avg_train_loss)
        history['train_acc'].append(train_accuracy)

        # Testing phase
        model.train_mode = False
        test_loss = 0.0
        correct_test = 0
        total_test = 0
        test_batch_count = 0

        for x, y in test_loader:
            # Data conversion and transfer

            x = x.numpy().astype(np.float32)
            y = y.numpy()

            # Forward pass only (no backward pass during testing)
            predictions = model.forward(x)

            # Calculate test loss
            batch_loss, _ = softmax_with_cross_entropy(predictions, y)
            test_loss += batch_loss

            # Calculate test accuracy
            pred_classes =  np.argmax(predictions, axis=1)
            correct_test += np.sum(pred_classes == y)
            total_test += len(y)

            test_batch_count += 1

        # Calculate test metrics
        avg_test_loss = test_loss / test_batch_count
        test_accuracy = (correct_test / total_test) * 100
        history['test_loss'].append(avg_test_loss)
        history['test_acc'].append(test_accuracy)

        # Print metrics
        print(f"Train Loss: {avg_train_loss:.4f} | Train Acc: {train_accuracy:.2f}%")
        print(f"Test Loss: {avg_test_loss:.4f} | Test Acc: {test_accuracy:.2f}%")

    return history
In [15]:
model = Model([
    ConvolutionalLayer(1, 16, 3, 1),  # Input: 1 channel, Output: 16 channels, 3x3 kernel, padding 1
    ReLULayer(),
    ConvolutionalLayer(16, 32, 3, 1),  # 16 -> 32 channels, 3x3 kernel, padding 1
    ReLULayer(),
    MaxPoolingLayer(2, 2),  # 2x2 pooling with stride 2 -> output 32 * 14 * 14

    Flattener(),
    FullyConnectedLayer(32 * 14 * 14, 10),
])
In [16]:
epochs = 6
learning_rate = 0.001
train_history = train_model(model, train_loader, test_loader, epochs, learning_rate)
Epoch 1/6
------------------------------
Train Loss: 2.1801 | Train Acc: 25.20%
Test Loss: 1.9430 | Test Acc: 45.16%
Epoch 2/6
------------------------------
Train Loss: 1.6803 | Train Acc: 59.80%
Test Loss: 1.4002 | Test Acc: 72.31%
Epoch 3/6
------------------------------
Train Loss: 1.1905 | Train Acc: 76.90%
Test Loss: 1.0027 | Test Acc: 79.87%
Epoch 4/6
------------------------------
Train Loss: 0.8693 | Train Acc: 82.10%
Test Loss: 0.7747 | Test Acc: 82.52%
Epoch 5/6
------------------------------
Train Loss: 0.6829 | Train Acc: 84.10%
Test Loss: 0.6389 | Test Acc: 84.64%
Epoch 6/6
------------------------------
Train Loss: 0.5673 | Train Acc: 85.90%
Test Loss: 0.5575 | Test Acc: 85.94%
In [17]:
num_samples = 3

input_batched = torch.stack([train_dataset[i][0] for i in range(num_samples)])

target_class = np.stack([train_dataset[i][1] for i in range(num_samples)])

grad_cam_mask = model.grad_cam_v1(input_batched, threshold=0)[0]
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [18]:
grad_cam_mask = model.grad_cam_v2(input_batched, target_class=target_class, threshold=0)[0]
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [21]:
grad_cam_mask = model.grad_cam_v3(input_batched, target_class=target_class,conv_layer_index=2, threshold=0)[0]
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Results and Reflections¶

As for me, the custom implementation provided more information, particularly on the MNIST dataset. The more straightforward methods (v1 and v2) gave sharper and more intuitive signals about what the model was actually attending to.

For instance, in the case of the digit zero, we could clearly see that the model responds to the circular outline and the empty middle space. These versions made it possible to interpret the prediction by identifying structural elements like angles and boundaries.

On the other hand, the v3 implementation (original Grad-CAM) appeared more abstract. It resembled a probabilistic mask, indicating which parts of the image contributed positively to the target class. While this gave a good sense of class confidence spread, it often felt more like a soft coloring over the image than a precise signal.

This was especially visible with digits like 0 and 5, where the heatmap highlighted confident regions.

For digit 4, the visualization is inconsistent and somewhat buggy. I didn’t investigate this further.

Cat vs Dog Classification with Grad‑CAM (Using PyTorch)¶

In the second part of the assignment, we apply Grad-CAM to a more realistic dataset with higher-resolution images to evaluate its interpretability in more complex visual scenarios.

We use the Cats vs Dogs dataset from Kaggle. The dataset provides a clearer distinction between classes and more natural image variation compared to datasets like MNIST or CIFAR-10.

To handle these images, we define a simple CNN using PyTorch with adaptive pooling to accommodate the larger input size. Grad-CAM is implemented using forward and backward hooks on a convolutional layer, allowing us to visualize which regions of the image contribute most to the model's predictions.

In [22]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np

Step 1: Load Dataset¶

Download dataset using Kaggle hub

In [23]:
import kagglehub

path = kagglehub.dataset_download("shaunthesheep/microsoft-catsvsdogs-dataset")

print("Path to dataset files:", path)
/home/may33/miniconda3/envs/cnn/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
Path to dataset files: /home/may33/.cache/kagglehub/datasets/shaunthesheep/microsoft-catsvsdogs-dataset/versions/1
In [24]:
import os

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

class CatDogKaggleDataset(Dataset):
    def __init__(self, root, transform=None):
        self.transform = transform
        self.image_files = []
        self.labels = []

        for folder, label in zip(["Cat", "Dog"], [0, 1]):
            folder_path = os.path.join(root, folder)
            for fname in os.listdir(folder_path):
                if fname.lower().endswith('.jpg'):
                    self.image_files.append(os.path.join(folder_path, fname))
                    self.labels.append(label)

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, index):
        img_path = self.image_files[index]
        img = Image.open(img_path).convert('RGB')
        label = self.labels[index]
        if self.transform:
            img = self.transform(img)
        return img, label

Step 2: Define the CNN Model¶

Our model consists of:

  • Two four layers with ReLU and max pooling.
  • An adaptive average pooling layer to fix the output size.
  • A fully connected layer that outputs logits for 2 classes.

We choose an adaptive pooling size of (7,7) so that the FC layer's input size is 32 * 7 * 7 = 1568.

In [32]:
class CatVsDogCNN(nn.Module):
    def __init__(self):
        super(CatVsDogCNN, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=5, padding=1)

        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=5, padding=1)

        self.conv3 = nn.Conv2d(in_channels=32, out_channels=32, kernel_size=5, padding=1)

        self.pool = nn.MaxPool2d(2, 2)
        # Adaptive average pooling to (7,7)
        self.adaptive_pool = nn.AdaptiveAvgPool2d((7, 7))
        # Fully connected layer: input size = 32 * 7 * 7, output = 2 classes (cat, dog)
        self.fc = nn.Linear(32 * 7 * 7, 2)

    def forward(self, x):

        x = self.pool(F.relu(self.conv1(x)))

        x = self.pool(F.relu(self.conv2(x)))

        x = self.pool(F.relu(self.conv3(x)))

        x = self.adaptive_pool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x

Step 3: Implement Grad‑CAM¶

The GradCAM class registers hooks on a target layer (we use conv4) to capture:

  • The forward activations (feature maps).
  • The gradients during the backward pass.

The heatmap is computed as a weighted sum of the feature maps (using global average pooling of the gradients as weights), followed by ReLU and normalization.

In [33]:
class GradCAM:
    def __init__(self, model, target_layer):
        self.model = model
        self.target_layer = target_layer
        self.gradients = None
        self.activations = None
        self._register_hooks()

    def _register_hooks(self):
        def forward_hook(module, input, output):
            self.activations = output.detach()

        def backward_hook(module, grad_in, grad_out):
            self.gradients = grad_out[0].detach()

        self.target_layer.register_forward_hook(forward_hook)
        self.target_layer.register_backward_hook(backward_hook)

    def generate(self, input_tensor, target_class=None):
        output = self.model(input_tensor)
        if target_class is None:
            target_class = output.argmax(dim=1).item()

        self.model.zero_grad()
        score = output[0, target_class]
        score.backward()

        # global average pooling on gradients
        weights = self.gradients.mean(dim=[2, 3], keepdim=True)
        # weighted combination of activations
        cam = (weights * self.activations).sum(dim=1, keepdim=True)
        cam = F.relu(cam)
        # normalization
        cam = cam - cam.min()
        cam = cam / (cam.max() + 1e-8)
        cam = cam.squeeze().cpu().numpy()
        return cam, target_class

Step 4: Prepare Data¶

I use the downloaded Microsoft Cats vs Dogs dataset from Kaggle, which contains a large number of natural images divided into two folders: one for cats and one for dogs. Each class has varying backgrounds, poses, and lighting conditions.

To work with this data:

  • define a custom dataset class that assigns labels based on folder names ("Cat" = 0, "Dog" = 1).
  • Images are resized to 224×224 and normalized using ImageNet statistics.
  • The dataset is split into 80% training and 20% testing using random_split.
  • DataLoaders are constructed for efficient batching and shuffling during training and evaluation.
In [34]:
dataset_path = "/home/may33/.cache/kagglehub/datasets/shaunthesheep/microsoft-catsvsdogs-dataset/versions/1/PetImages"

train_dataset = CatDogKaggleDataset(root=dataset_path, transform=transform)

total_size = len(train_dataset)
train_size = int(0.8 * total_size)
val_size = total_size - train_size

train_subset, test_subset = random_split(train_dataset, [train_size, val_size])

batch_size = 256

train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_subset, batch_size=batch_size, shuffle=True)

def denormalize(img_tensor):
    mean = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])
    img = img_tensor.squeeze().cpu().permute(1, 2, 0).numpy()
    img = std * img + mean
    img = np.clip(img, 0, 1)
    return img

print("Total samples:", total_size)
print("Train samples:", len(train_subset))
print("Test samples:", len(test_subset))
Total samples: 24998
Train samples: 19998
Test samples: 5000
In [35]:
input_img = train_dataset[0][0]

img_np = denormalize(input_img)
In [36]:
plt.imshow(img_np)
plt.grid(False)
plt.show()
No description has been provided for this image
In [37]:
def train_model(model, train_loader, test_loader, criterion, optimizer, num_epochs, scheduler=None):
    """
    Enhanced training function that tracks and reports all key metrics

    Args:
        model: Neural network model
        train_loader: DataLoader for training data
        test_loader: DataLoader for test data
        criterion: Loss function
        optimizer: Optimization algorithm
        num_epochs: Number of training epochs
        scheduler: Optional learning rate scheduler

    Returns:
        dict: Dictionary containing training history with all metrics
    """
    train_losses = []
    train_accuracies = []
    test_losses = []
    test_accuracies = []

    for epoch in range(1, num_epochs + 1):
        # Training phase
        model.train()
        running_loss = 0.0
        correct_train = 0
        total_train = 0

        for images, labels in train_loader:
            # Move data to GPU
            images, labels = images.to(device), labels.to(device)

            # Forward pass
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Calculate training accuracy
            _, predicted = torch.max(outputs, 1)
            total_train += labels.size(0)
            correct_train += (predicted == labels).sum().item()

            # Backward pass
            loss.backward()
            optimizer.step()

            # Accumulate loss
            running_loss += loss.item() * images.size(0)

        # Calculate training metrics
        epoch_loss = running_loss / len(train_loader.dataset)
        train_accuracy = 100 * correct_train / total_train
        train_losses.append(epoch_loss)
        train_accuracies.append(train_accuracy)

        # Evaluation phase
        model.eval()
        test_loss = 0.0
        correct_test = 0
        total_test = 0

        with torch.no_grad():
            for images, labels in test_loader:
                # Move data to GPU
                images, labels = images.to(device), labels.to(device)

                # Forward pass
                outputs = model(images)
                loss = criterion(outputs, labels)

                # Calculate test accuracy
                _, predicted = torch.max(outputs, 1)
                total_test += labels.size(0)
                correct_test += (predicted == labels).sum().item()

                # Accumulate loss
                test_loss += loss.item() * images.size(0)

        # Calculate test metrics
        epoch_test_loss = test_loss / len(test_loader.dataset)
        test_accuracy = 100 * correct_test / total_test
        test_losses.append(epoch_test_loss)
        test_accuracies.append(test_accuracy)

        # Scheduler step: for ReduceLROnPlateau, step using test loss; else, step normally.
        if scheduler:
            if isinstance(scheduler, optim.lr_scheduler.ReduceLROnPlateau):
                scheduler.step(epoch_test_loss)
            else:
                scheduler.step()

        # Print all metrics
        print(f"Epoch {epoch}/{num_epochs} - "
              f"Train Loss: {epoch_loss:.4f}, Train Acc: {train_accuracy:.2f}%, "
              f"Test Loss: {epoch_test_loss:.4f}, Test Acc: {test_accuracy:.2f}%")

    return {
        'train_loss': train_losses,
        'train_acc': train_accuracies,
        'test_loss': test_losses,
        'test_acc': test_accuracies
    }
In [38]:
num_epochs = 20

model = CatVsDogCNN().to(device)
model.train()

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.2)
history = train_model(model, train_loader, test_loader, criterion, optimizer, num_epochs, scheduler)
Epoch 1/20 - Train Loss: 0.6237, Train Acc: 64.33%, Test Loss: 0.5992, Test Acc: 67.52%
Epoch 2/20 - Train Loss: 0.5484, Train Acc: 72.05%, Test Loss: 0.5184, Test Acc: 74.74%
Epoch 3/20 - Train Loss: 0.4914, Train Acc: 76.39%, Test Loss: 0.6587, Test Acc: 67.68%
Epoch 4/20 - Train Loss: 0.4701, Train Acc: 78.05%, Test Loss: 0.4904, Test Acc: 76.28%
Epoch 5/20 - Train Loss: 0.4404, Train Acc: 79.84%, Test Loss: 0.4647, Test Acc: 79.14%
Epoch 6/20 - Train Loss: 0.3995, Train Acc: 82.06%, Test Loss: 0.4157, Test Acc: 81.86%
Epoch 7/20 - Train Loss: 0.3875, Train Acc: 82.95%, Test Loss: 0.4102, Test Acc: 82.50%
Epoch 8/20 - Train Loss: 0.3818, Train Acc: 83.21%, Test Loss: 0.4052, Test Acc: 82.00%
Epoch 9/20 - Train Loss: 0.3747, Train Acc: 83.55%, Test Loss: 0.4047, Test Acc: 83.04%
Epoch 10/20 - Train Loss: 0.3669, Train Acc: 83.96%, Test Loss: 0.3968, Test Acc: 83.56%
Epoch 11/20 - Train Loss: 0.3596, Train Acc: 84.43%, Test Loss: 0.3886, Test Acc: 83.40%
Epoch 12/20 - Train Loss: 0.3557, Train Acc: 84.68%, Test Loss: 0.3877, Test Acc: 83.62%
Epoch 13/20 - Train Loss: 0.3550, Train Acc: 84.73%, Test Loss: 0.3879, Test Acc: 83.42%
Epoch 14/20 - Train Loss: 0.3533, Train Acc: 84.76%, Test Loss: 0.3857, Test Acc: 83.86%
Epoch 15/20 - Train Loss: 0.3510, Train Acc: 85.10%, Test Loss: 0.3839, Test Acc: 83.50%
Epoch 16/20 - Train Loss: 0.3488, Train Acc: 85.04%, Test Loss: 0.3836, Test Acc: 83.48%
Epoch 17/20 - Train Loss: 0.3482, Train Acc: 85.01%, Test Loss: 0.3841, Test Acc: 83.40%
Epoch 18/20 - Train Loss: 0.3481, Train Acc: 85.08%, Test Loss: 0.3837, Test Acc: 83.88%
Epoch 19/20 - Train Loss: 0.3478, Train Acc: 85.06%, Test Loss: 0.3829, Test Acc: 83.80%
Epoch 20/20 - Train Loss: 0.3474, Train Acc: 85.12%, Test Loss: 0.3827, Test Acc: 83.70%
In [47]:
# Set model to evaluation mode.
model.eval()

# Instantiate GradCAM using the desired target layer
gradcam = GradCAM(model, target_layer=model.conv3)

num_images = 20
random_indices = random.sample(range(len(test_subset)), num_images)

for idx in random_indices:
    # Load a random image from the test dataset
    img, label = test_subset[idx]
    input_img = img.unsqueeze(0).to(device)

    # Generate Grad-CAM heatmaps for both class 0 (cat) and class 1 (dog)
    cam_cat, _ = gradcam.generate(input_img, target_class=0)
    cam_dog, _ = gradcam.generate(input_img, target_class=1)

    # Denormalize for visualization
    orig_img = denormalize(input_img)

    # Resize both CAMs
    cam_cat_resized = resize(cam_cat, (224, 224), mode='reflect', anti_aliasing=True)
    cam_dog_resized = resize(cam_dog, (224, 224), mode='reflect', anti_aliasing=True)

    # Plot original + both Grad-CAMs
    fig, axs = plt.subplots(1, 3, figsize=(16, 4))

    axs[0].imshow(orig_img)
    axs[0].set_title("Original Image")
    axs[0].axis('off')

    axs[1].imshow(orig_img)
    axs[1].imshow(cam_cat_resized, cmap='jet', alpha=0.5)
    axs[1].set_title("Grad-CAM: Cat (class 0)")
    axs[1].axis('off')

    axs[2].imshow(orig_img)
    axs[2].imshow(cam_dog_resized, cmap='jet', alpha=0.5)
    axs[2].set_title("Grad-CAM: Dog (class 1)")
    axs[2].axis('off')

    plt.tight_layout()
    plt.show()

    print(f"Sample idx {idx} | Ground Truth: {label} | Predicted: {model(input_img).argmax(dim=1).item()}")
No description has been provided for this image
Sample idx 3502 | Ground Truth: 0 | Predicted: 0
No description has been provided for this image
Sample idx 3046 | Ground Truth: 1 | Predicted: 1
No description has been provided for this image
Sample idx 4818 | Ground Truth: 1 | Predicted: 1
No description has been provided for this image
Sample idx 4177 | Ground Truth: 0 | Predicted: 1
No description has been provided for this image
Sample idx 1968 | Ground Truth: 1 | Predicted: 1
No description has been provided for this image
Sample idx 2005 | Ground Truth: 0 | Predicted: 0
No description has been provided for this image
Sample idx 2103 | Ground Truth: 1 | Predicted: 0
No description has been provided for this image
Sample idx 3789 | Ground Truth: 1 | Predicted: 1
No description has been provided for this image
Sample idx 4950 | Ground Truth: 1 | Predicted: 1
No description has been provided for this image
Sample idx 2626 | Ground Truth: 0 | Predicted: 1
No description has been provided for this image
Sample idx 3477 | Ground Truth: 1 | Predicted: 1
No description has been provided for this image
Sample idx 4008 | Ground Truth: 0 | Predicted: 0
No description has been provided for this image
Sample idx 3934 | Ground Truth: 1 | Predicted: 1
No description has been provided for this image
Sample idx 397 | Ground Truth: 1 | Predicted: 0
No description has been provided for this image
Sample idx 3605 | Ground Truth: 1 | Predicted: 1
No description has been provided for this image
Sample idx 2703 | Ground Truth: 1 | Predicted: 1
No description has been provided for this image
Sample idx 524 | Ground Truth: 0 | Predicted: 0
No description has been provided for this image
Sample idx 1597 | Ground Truth: 1 | Predicted: 0
No description has been provided for this image
Sample idx 2730 | Ground Truth: 0 | Predicted: 0
No description has been provided for this image
Sample idx 4507 | Ground Truth: 0 | Predicted: 0

Results¶

It was actually a lot of fun to visually explore these results. Grad-CAM turned out to be a great tool to understand how the model processes images. I ended up adding more and more examples just to see the patterns the model relies on.

Here are some key observations:

  • Grad-CAM does reveal meaningful regions associated with class decisions. The highlighted areas often correspond to intuitive and domain-specific features.

  • Cat-specific focus: For cats, the model frequently attends to large whiskers, the shape and position of the eyes (especially in kittens), and the patterns on the forehead and ears.

  • Dog-specific focus: For dogs, the attention is often drawn to the mouth area (especially when it’s open or the tongue is visible), curled ears, and overall head shape.

  • Accessory bias: The model often focuses heavily on collars, leashes, and similar accessories—strong indicators of “dog” in this dataset. This occasionally leads to misclassifications, such as labeling a cat with a toy or wearing a collar as a dog.

  • Contextual cues: Background elements also influence decisions. Dogs are more frequently shown in cages or outdoors (e.g., on grass), and the model seems to learn these patterns as latent features.

Overall, Grad-CAM was useful not only for explaining predictions but also for uncovering dataset biases and model shortcuts.